from IPython.display import HTML
# Build an HTML snippet for the notebook output that toggles code visibility.
# The embedded script defines `code_toggle`, which hides every `div.input`
# element when `code_show` is true, shows them otherwise, and then flips the
# flag. It is run once when the document is ready (so inputs start hidden,
# since `code_show` starts as true) and again each time the form button is
# submitted. The script relies on jQuery's `$` being available on the page.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
<input type="submit" value="Click here to toggle on/off the raw code.">
</form>''')
from IPython.display import Image
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as grd
import folium as fol
from sklearn import decomposition, preprocessing
from scipy import stats
from scipy.stats import shapiro, normaltest, anderson, mannwhitneyu
from branca.element import Figure
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
%matplotlib inline
# Importing Listings And Reviews Data from JOJIE
def load_data(date, base_dir='/mnt/data/public/insideairbnb/'
                             'data.insideairbnb.com/japan/kantō/tokyo/'):
    """Return the Inside Airbnb Tokyo listings as a pandas.DataFrame.

    Reads the gzip-compressed file ``<base_dir>/<date>/data/listings.csv.gz``
    and parses the ``last_scraped`` and ``host_since`` columns as dates.
    Only the listings data set is loaded; no review file is read here.

    Parameters
    ----------
    date : str
        Scraping date in yyyy-mm-dd format; it names the sub-directory
        holding that scrape.
    base_dir : str, optional
        Directory containing one sub-directory per scraping date.
        Defaults to the Tokyo folder of the shared Inside Airbnb data.

    Returns
    -------
    pandas.DataFrame
        The raw listings table, one row per listing.
    """
    # Normalise the trailing slash so both 'dir' and 'dir/' give one path.
    listings_path = (f"{str(base_dir).rstrip('/')}/"
                     f"{date}/data/listings.csv.gz")
    listings_raw = pd.read_csv(listings_path,
                               compression='gzip',
                               # read the file in one pass rather than in
                               # chunks with per-chunk dtype inference
                               low_memory=False,
                               parse_dates=['last_scraped', 'host_since'])
    return listings_raw
# Load Data from JOJIE
# lr = load_data('2021-07-20')
# Feature Selection, Data Cleaning, and Dummy Variable Creation
# fields = ['name',
# 'host_id',
# 'host_name',
# 'last_scraped',
# 'host_since',
# 'host_response_rate',
# 'host_is_superhost',
# 'host_listings_count',
# 'latitude',
# 'longitude',
# 'property_type',
# 'room_type',
# 'accommodates',
# 'bathrooms_text',
# 'bedrooms',
# 'beds',
# 'amenities',
# 'price',
# 'minimum_nights',
# 'maximum_nights',
# 'availability_365',
# 'number_of_reviews',
# 'review_scores_rating',
# 'review_scores_accuracy',
# 'review_scores_cleanliness',
# 'review_scores_checkin',
# 'review_scores_communication',
# 'review_scores_location',
# 'review_scores_value',
# 'reviews_per_month']
# Select fields and drop rows with missing values
# lr_select = lr[fields].dropna()
# Count the number of listed amenities
# lr_select['amenities_ct'] = lr_select['amenities'].str.strip('[]').\
# str.split(',').str.len()
# Calculate number of days as host
# lr_select['days_as_host'] = (lr_select.last_scraped -
# lr_select.host_since).dt.days
# Convert price to float
# lr_select['price'] = lr_select['price'].replace({'\$': '',
# ',': ''},
# regex=True).astype(float)
# Extract bathroom count from bathrooms_text
# def bathrooms(text):
# """Extract number of bathrooms."""
# try:
# bath = float(text.split()[0])
# except:
# if 'half' in text.lower():
# bath = 0.5
# else:
# bath = np.nan
# return bath
# lr_select['bathrooms'] = lr_select['bathrooms_text'].apply(bathrooms)
# Host response rate to float
# lr_select['hrr'] = lr_select['host_response_rate'].replace({'%': ''},
# regex=True)
# lr_select['hrr'] = lr_select['hrr'].astype(float) / 100
# One-hot encoding of categorical variables
# Superhost Tagging
# OH_superhost = pd.get_dummies(lr_select.host_is_superhost,
# prefix='superhost',
# drop_first=True)
# Re-Tag Superhost
# lr_select['host_is_superhost'] = np.where(lr_select.host_is_superhost ==
# 't',
# 'Superhost',
# 'Non-Superhost')
# Drop columns
# lr_select = lr_select.drop(['amenities',
# 'last_scraped',
# 'host_since',
# 'bathrooms_text',
# 'host_response_rate'], axis=1)
# Export to csv file
# lr_select.dropna().to_csv('airbnb20210720.csv')
# OH_superhost.to_csv('OH_superhost20210720.csv')
Superhost, Diagnosed: Exploring the Relationship Between Host Status and Listing Characteristics of AirBnB in Tokyo, Japan
# Load the Tokyo skyline photo from a local file for display in the notebook.
# Photo Source: https://wedc.org/blog/governor-evers-will-lead-trade-mission-
# to-japan-in-september/mt-fuji-and-tokyo-skyline/
Image(filename='Tokyo-Japan-skyline.jpg')
The startup boom has given birth to the sharing economy – a system in which people share resources or services for a price, potentially leveraging the Internet to scale. One of the most successful global examples of this is Airbnb, which has redefined and disrupted the hospitality and real estate industries by allowing hosts to list their properties for guests to book. Airbnb distinguishes its hosts as Superhosts and Non-Superhosts. This study focuses on Tokyo, Japan, and draws data from InsideAirbnb.com to explore the relationships among listing characteristics and to compare and contrast them between these two types of hosts. Throughout the paper, we apply data visualization, exploratory data analysis, hypothesis testing, and dimensionality reduction techniques. The hypothesis tests suggest that Superhost listings tend to be priced higher and receive better overall reviews than their Non-Superhost counterparts. Further, based on the EDA results, it appears that Superhosts have been hosting longer and that their listings are occupied more frequently than those of Non-Superhosts. However, PCA does not seem to show any strong differentiation between Superhosts and Non-Superhosts – and it seems the real differentiator for the host types is mostly being able to comply with Airbnb’s Superhost requirements.
From futuristic entertainment to old temples, Japan has always been a hotspot destination for travelers from around the world. In 2019, the Japan Tourism Agency said over 31.9 million overseas tourists visited the country. Tourists can consider staying in capsule hotels or traditional ryokans, but others have found practical value in booking Airbnb accommodations instead for their cost and convenience, particularly in an expensive city like Tokyo. Tourists may also look at several factors, including the host status, which may have been influenced by certain listing characteristics.
What is a Superhost?
Superhosts are experienced hosts who provide a shining example for other hosts, and extraordinary experiences for their guests. Once a host reaches Superhost status, a badge will automatically appear on their listing and profile to help prospective guests identify them. All hosts are reviewed quarterly to determine who is awarded this sought-after badge.
Superhost status is awarded to hosts that meet the following requirements:
This study explores the relationships among listing characteristics, and compares and contrasts them between Superhosts and Non-Superhosts. We aim to unfold, little by little, the interactions of the Airbnb features and identify which specific characteristics stand out or can be grouped together.
We will apply data visualization, exploratory data analysis, hypothesis testing and dimensionality reduction techniques to scrutinize the data characteristics.
This study consisted of various stages, giving focus on Exploratory Data Analysis (EDA) and dimensionality reduction techniques.
The steps will be described and illustrated in greater detail throughout this report, with the following summary:
Through these, we aim to glean insights, and to compare and contrast Airbnb listings belonging to both Superhosts and Non-Superhosts.
Airbnb
Airbnb started when two hosts welcomed three guests to their San Francisco home in 2007. Friends Brian Chesky and Joe Gebbia thought of setting up an air mattress in their living room and turning it into a bed and breakfast. Airbed & Breakfast, now popularly known as Airbnb, was established in 2008 as an online marketplace for lodging, homestays for vacation rentals, and tourism activities. Since then it has grown to 4 million Hosts and welcomed more than 900 million guest arrivals all over the world. As of September 2020, Airbnb has 5.6 million listings in over 100,000 cities around the globe.
Airbnb started its operations in Japan in 2013 and has grown rapidly since. However, in 2018, a new homesharing law called minpaku required hosts to register and secure permits from the government, which caused listings to decrease by up to 80%, from around 60,000. It recovered the following year and raised listings to around 50,000, plus an additional 23,000 rooms in traditional hospitality categories like hotels and ryokans. The COVID-19 pandemic has impacted the utilization of Airbnb listings globally, which may slowly recover as the world improves vaccination rates and eases travel restrictions.
InsideAirbnb.com
Inside Airbnb is a watchdog website that scrapes and analyzes publicly available information from Airbnb's website. Founded by Murray Cox, InsideAirbnb provides data as well as insights on how Airbnb competes not just with the hotel industry, but also with the residential housing market.
The data set was extracted from the readily available raw web-scraped information from the Airbnb website as of 20 July 2021. This can be accessed via $Jojie$ at /mnt/data/public/insideairbnb/data.insideairbnb.com/japan/kantō/tokyo/. Some fields from the original data set were manipulated to retain numeric values, while some were re-assigned with values (i.e. Superhost and Non-Superhost tagging).
After selecting fields of interest to be used in this study and dropping lines with missing values, 2 data sets were produced:
- airbnb20210720.csv as the main data set with 6,617 rows. One of these has a minimum_nights value of 999 and will be dropped in the succeeding sections, but will be added back in the PCA section.
- OH_superhost20210720.csv, which contains the boolean host type tagging (0 for Non-Superhost and 1 for Superhost).

The following table details the fields included in this study.
# Data preparation is kept commented out in this notebook; the EDA and
# dimensionality-reduction steps start from the processed csv exports.
lr_select = pd.read_csv('airbnb20210720.csv', index_col=0)
OH_superhost = pd.read_csv('OH_superhost20210720.csv', index_col=0)
# Set aside the listing(s) whose `minimum_nights` is 999 or more
min_nights_outlier = lr_select.loc[lr_select['minimum_nights'] >= 999]
# Keep the remaining listings and discard rows with missing values
lr_select = lr_select.loc[lr_select['minimum_nights'] < 999].dropna()
| # | Column | Non-Null Count | Dtype | Description |
|---|---|---|---|---|
| 0 | name | 6616 | object | Listing name |
| 1 | host_id | 6616 | object | Host ID |
| 2 | host_name | 6616 | object | Host Name |
| 3 | host_is_superhost | 6616 | object | Re-tagged host type (Superhost/Non-Superhost) |
| 4 | host_listings_count | 6616 | float64 | Total listings under the same host |
| 5 | latitude | 6616 | float64 | Latitude coordinate |
| 6 | longitude | 6616 | float64 | Longitude coordinate |
| 7 | property_type | 6616 | object | Property Type |
| 8 | room_type | 6616 | object | Room Type |
| 9 | accommodates | 6616 | int64 | Capacity (per number of pax) of listing |
| 10 | bedrooms | 6616 | float64 | Count of bedrooms in listing |
| 11 | beds | 6616 | float64 | Count of beds in listing |
| 12 | price | 6616 | float64 | Price of listing in Japanese Yen |
| 13 | minimum_nights | 6616 | int64 | Minimum number of nights per booking |
| 14 | maximum_nights | 6616 | int64 | Maximum number of nights per booking |
| 15 | availability_365 | 6616 | int64 | Listing availability over the next 365 days |
| 16 | number_of_reviews | 6616 | int64 | Number of customer reviews |
| 17 | review_scores_rating | 6616 | float64 | Overall Experience Rating (0 to 5, continuous) |
| 18 | review_scores_accuracy | 6616 | float64 | Accuracy Rating (0 to 5, continuous) |
| 19 | review_scores_cleanliness | 6616 | float64 | Cleanliness Rating (0 to 5, continuous) |
| 20 | review_scores_checkin | 6616 | float64 | Check-in Process Rating (0 to 5, continuous) |
| 21 | review_scores_communication | 6616 | float64 | Host Communication Rating (0 to 5, continuous) |
| 22 | review_scores_location | 6616 | float64 | Location Rating (0 to 5, continuous) |
| 23 | review_scores_value | 6616 | float64 | Value Rating (0 to 5, continuous) |
| 24 | reviews_per_month | 6616 | float64 | Number of customer reviews per month |
| 25 | amenities_ct | 6616 | int64 | Count of listed amenities |
| 26 | days_as_host | 6616 | int64 | Number of days as host as of scraping date |
| 27 | bathrooms | 6616 | float64 | Number of bathrooms in listing |
| 28 | hrr | 6616 | float64 | Host Response Rate |
Figure 1. Tokyo AirBnb Map by Host Type
# Geo Map of Listings: landmark markers plus one circle per listing,
# coloured by host type and enlarged/recoloured for high-price outliers.
fig3 = Figure(width=950, height=550)
# Map centred on Takeshita Street (Harajuku, Shibuya)
m3 = fol.Map(location=[35.6713, 139.7048],
             tiles='cartodbpositron',
             zoom_start=11,
             prefer_canvas=True)
# Landmarks as (label, [latitude, longitude], icon name)
markers = [('Tokyo Imperial Palace', [35.6838, 139.7507], 'flag'),
           ('Tokyo Station', [35.6814, 139.7661], 'star-empty'),
           ('Tokyo Tower', [35.6586, 139.7454], 'tower'),
           ('Tokyo Skytree', [35.7101, 139.8107], 'camera'),
           ('Senso-ji Temple', [35.7148, 139.7967], 'camera'),
           ('Takeshita Street', [35.6713, 139.7048], 'camera'),
           ('Shinjuku Gyoen National Garden', [35.6852, 139.7101], 'camera'),
           ('Ikebukuro Station', [35.7295, 139.7109], 'star-empty'),
           ('Tokyo Metropolitan Government Building', [35.6896, 139.6921],
            'flag'),
           ('Shibuya Station', [35.6580, 139.7016], 'star-empty'),
           ('Don Quixote Akihabara', [35.7008, 139.7718], 'shopping-cart'),
           ('Tokyo Haneda Airport', [35.5494, 139.7798], 'plane')]
# Plotting the landmark markers onto the map
for landmark_name, landmark_coords, landmark_icon in markers:
    fol.Marker(location=landmark_coords,
               popup=landmark_name,
               tooltip=landmark_name,
               icon=fol.Icon(color='darkblue',
                             icon=landmark_icon)).add_to(m3)
fig3.add_child(m3)
# One feature group per host type so each can be toggled in the layer control
superhost = dict()
for s in pd.unique(lr_select['host_is_superhost']):
    superhost[s] = fol.FeatureGroup(name=s)
# Mean and standard deviation of price - for outlier markers
price_mean = lr_select.price.mean(axis=0)
price_std = lr_select.price.std(axis=0)
# A listing is drawn as an outlier when its price exceeds mean + 3 std;
# computed once here instead of on every row.
price_outlier_cutoff = price_mean + 3 * price_std
# (regular colour, outlier colour) per host type; any other value is green
host_type_colors = {'Superhost': ('red', 'magenta'),
                    'Non-Superhost': ('blue', 'cyan')}
other_host_colors = ('green', 'green')
# Plotting the data points on the map
for _, r in lr_select.iterrows():
    is_price_outlier = r.price > price_outlier_cutoff
    regular_color, outlier_color = host_type_colors.get(r.host_is_superhost,
                                                        other_host_colors)
    if is_price_outlier:
        c = outlier_color
        rad = 25
    else:
        c = regular_color
        rad = 15
    # `r['name']` rather than `r.name`: the latter is the row's index label.
    # Popups are rendered as HTML, so `<br>` is used to break lines.
    # NOTE(review): folium inserts popup strings as HTML by default, so
    # markup inside a listing or host name is not escaped - confirm whether
    # escaping is needed.
    popup_string = (f"NAME: {r['name']}<br>"
                    f"HOST: {r.host_name}<br>HOST ID: {r.host_id}")
    listing_circle = fol.Circle([r.latitude, r.longitude],
                                radius=rad,
                                color=c,
                                fill_color=c,
                                popup=popup_string,
                                tooltip=f"¥{r['price']:,.2f}")
    listing_circle.add_to(superhost[r.host_is_superhost])
# Attach every host-type layer to the map, then add the toggle control
for host_layer in superhost.values():
    host_layer.add_to(m3)
fol.LayerControl().add_to(m3)
m3